import pandas as pd

# ========== Configuration ==========
# Excel workbook the distribution table is written to.
output_path = r"C:\Users\Lenovo\Desktop\000\价值维度分布_公众.xlsx"
# CSV file the analysis reads.
input_path = r"C:\Users\Lenovo\Desktop\000\data\公众奖励模型训练数据.csv"
# ===================================

# Dimension -> keyword lists (sample for the "public" variant).
# Insertion order matters: it is the order in which dimensions are reported
# for a single text.
dim_dict = {
    "文化传承与历史保护": [
        "文化", "历史",
    ],
    "配套设施改善": [
        "提升", "改造", "配套", "融合", "升级",
        "改善", "更新", "保护", "整合",
    ],
    "整体发展与宜居生活": [
        "发展", "规划", "品质", "生活", "居住",
        "宜居", "活力", "绿化", "生活品质",
    ],
    "空间利用与生态环境": [
        "用地", "空间", "生态", "园区", "土地",
        "片区", "空间布局", "环境", "景观",
    ],
    "公共服务与社区生活": [
        "城市", "社区", "居民", "公共服务", "交通", "街区",
    ],
    "土地与资源效率": [
        "优化", "利用效率", "效率",
    ],
    "产业经济与创新发展": [
        "产业", "商业", "创新", "科技",
        "产业园", "企业", "产业结构", "工业",
    ],
}

# 1. Load the data
df = pd.read_csv(input_path)

# 2. Pick the preferred plan per row: the text of 方案1 where labels == 1,
#    otherwise the text of 方案2.
#    Series.where does this column-wise, so there is no per-row Python call,
#    and it still yields a plain column when the CSV has no data rows.
df["best_plan"] = df["方案1"].where(df["labels"] == 1, df["方案2"])

# 3. 给方案打标签
def match_dimensions(text, dim_dict):
    """Return the dimensions that have at least one keyword occurring in ``text``.

    Args:
        text: Text to scan. Values that are not strings are converted with
            ``str``. ``None`` and float NaN are treated as missing and match
            nothing.
        dim_dict: Mapping of dimension name to an iterable of keyword strings.

    Returns:
        A list of dimension names in ``dim_dict`` iteration order. Each
        dimension appears at most once, however many of its keywords match.
    """
    # A missing value would otherwise be scanned as the literal "None" / "nan"
    # and could spuriously hit a keyword contained in those words.
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return []

    haystack = str(text)  # convert once, not once per keyword
    return [
        dim
        for dim, keywords in dim_dict.items()
        # any() stops at the first hit, so a dimension is never counted twice
        if any(kw in haystack for kw in keywords)
    ]

# Tag every preferred plan with the list of dimensions its text matches.
df["匹配维度"] = df["best_plan"].apply(match_dimensions, args=(dim_dict,))

# 4. Tally the dimension distribution
# Flatten the per-plan lists into one sequence of dimension names.
all_dims = [dim for dims in df["匹配维度"] for dim in dims]

# One row per dimension that occurred, most frequent first.
dim_count = (
    pd.Series(all_dims)
    .value_counts()
    .rename_axis("维度")
    .reset_index(name="出现次数")
)

# Share of each dimension among all recorded hits, in percent.
total_hits = dim_count["出现次数"].sum()
dim_count["占比(%)"] = dim_count["出现次数"] / total_hits * 100

# 5. Save the result
dim_count.to_excel(output_path, index=False)

print("统计完成！结果已保存到：", output_path)
